library(tidyverse)
library(nycflights13)
library(ggplot2)
library(readr)
library(here)
- Load the
nycflights13::flights data set. Does it come
formatted as a tibble?
data("flights")
class(flights)
## [1] "tbl_df" "tbl" "data.frame"
- Find all flights that
- Had an arrival delay of two or more hours.
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- Flew to Houston (IAH or HOU)
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- Were delayed by at least an hour, but made up over 30 minutes in
flight.
filter(flights, dep_delay >= 60 & arr_delay <= dep_delay - 30)
## # A tibble: 2,074 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1716 1545 91 2140 2039
## 2 2013 1 1 2205 1720 285 46 2040
## 3 2013 1 1 2326 2130 116 131 18
## 4 2013 1 3 1503 1221 162 1803 1555
## 5 2013 1 3 1821 1530 171 2131 1910
## 6 2013 1 3 1839 1700 99 2056 1950
## 7 2013 1 3 1850 1745 65 2148 2120
## 8 2013 1 3 1923 1815 68 2036 1958
## 9 2013 1 3 1941 1759 102 2246 2139
## 10 2013 1 3 1950 1845 65 2228 2227
## # ... with 2,064 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- Departed between midnight and 6am.
filter(flights , between(dep_time, 0, 600) | dep_time == 2400)
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- Which flights took the longest and which the shortest?
slice(flights, which.max(air_time))
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 17 1337 1335 2 1937 1836
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
slice_max(flights, air_time)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 17 1337 1335 2 1937 1836
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
slice(flights, which.min(air_time))
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 16 1355 1315 40 1442 1411
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
dplyr::slice_min(flights, air_time)
## # A tibble: 2 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 16 1355 1315 40 1442 1411
## 2 2013 4 13 537 527 10 622 628
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
- Brainstorm as many ways as possible to select
dep_time,
dep_delay, arr_time, and
arr_delay.
selection <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
flights[ ,selection]
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
flights %>% select(selection)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
- Create a new variable
first_flight which takes the
value TRUE if it is the first flight of the day.
flights <- mutate(flights, first_flight = if_else(day - lag(day) != 0 | is.na(day - lag(day)), TRUE, FALSE ))
filter(flights, first_flight == TRUE)
## # A tibble: 365 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 2 42 2359 43 518 442
## 3 2013 1 3 32 2359 33 504 442
## 4 2013 1 4 25 2359 26 505 442
## 5 2013 1 5 14 2359 15 503 445
## 6 2013 1 6 16 2359 17 451 442
## 7 2013 1 7 49 2359 50 531 444
## 8 2013 1 8 454 500 -6 625 648
## 9 2013 1 9 2 2359 3 432 444
## 10 2013 1 10 3 2359 4 426 437
## # ... with 355 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # first_flight <lgl>
flights %>%
group_by(year, month, day) %>%
mutate(first_flight = if_else(row_number() == 1, TRUE, FALSE)) %>%
select(first_flight)
## # A tibble: 336,776 x 4
## # Groups: year, month, day [365]
## year month day first_flight
## <int> <int> <int> <lgl>
## 1 2013 1 1 TRUE
## 2 2013 1 1 FALSE
## 3 2013 1 1 FALSE
## 4 2013 1 1 FALSE
## 5 2013 1 1 FALSE
## 6 2013 1 1 FALSE
## 7 2013 1 1 FALSE
## 8 2013 1 1 FALSE
## 9 2013 1 1 FALSE
## 10 2013 1 1 FALSE
## # ... with 336,766 more rows
- Find out in which district the most incidents were reported.
crime <- readr::read_csv(here::here('SoSe_2022/datasets/BostonCrime.csv'))
crime %>%
group_by(DISTRICT) %>%
summarise(N = n()) %>%
arrange(desc(N))
## # A tibble: 13 x 2
## DISTRICT N
## <chr> <int>
## 1 B2 15658
## 2 C11 12957
## 3 D4 12454
## 4 B3 11560
## 5 A1 10976
## 6 C6 7516
## 7 D14 6159
## 8 E18 5517
## 9 E13 5480
## 10 E5 4095
## 11 A7 3800
## 12 A15 2034
## 13 <NA> 682
- Create an ordered list of the top ten districts according to the
number of drug violations. Start with the district with most drug
violations.
crime %>%
filter(OFFENSE_CODE_GROUP == "Drug Violation") %>%
group_by(DISTRICT) %>%
summarise(N = n()) %>%
arrange(desc(N))
## # A tibble: 13 x 2
## DISTRICT N
## <chr> <int>
## 1 B2 849
## 2 B3 695
## 3 C6 599
## 4 A1 518
## 5 C11 468
## 6 D4 455
## 7 E13 280
## 8 D14 222
## 9 A7 203
## 10 E18 174
## 11 E5 139
## 12 A15 118
## 13 <NA> 57
- What was the most dangerous hour of the week according to the number
of homicides (
OFFENSE_CODE_GROUP == ‘Homicide’)?
crime %>%
filter(OFFENSE_CODE_GROUP == "Homicide") %>%
group_by(HOUR, DAY_OF_WEEK) %>%
summarise(N = n()) %>%
arrange(desc(N))
## # A tibble: 46 x 3
## # Groups: HOUR [20]
## HOUR DAY_OF_WEEK N
## <dbl> <chr> <int>
## 1 0 Thursday 5
## 2 9 Tuesday 3
## 3 21 Friday 3
## 4 21 Monday 3
## 5 21 Saturday 3
## 6 22 Tuesday 3
## 7 0 Saturday 2
## 8 4 Sunday 2
## 9 13 Saturday 2
## 10 14 Thursday 2
## # ... with 36 more rows
- Calculate for each district the proportion of motor vehicle
accidents to all reported incidents (
OFFENSE_CODE_GROUP == ‘Motor
Vehicle Accident Response’).
crime %>%
group_by(DISTRICT) %>%
transmute(prop_accidents = sum(OFFENSE_CODE_GROUP == "Motor Vehicle Accident Response") / n())
## # A tibble: 98,888 x 2
## # Groups: DISTRICT [13]
## DISTRICT prop_accidents
## <chr> <dbl>
## 1 B3 0.102
## 2 E18 0.134
## 3 E5 0.139
## 4 A1 0.0867
## 5 D4 0.0926
## 6 E5 0.139
## 7 B3 0.102
## 8 B2 0.125
## 9 D4 0.0926
## 10 C11 0.127
## # ... with 98,878 more rows
- Sort the data set chronologically by year and month. Then take the
first 10.000 observations and compute the proportion of incidents where
a shooting was involved for each district. Now take a random sample of
size 10.000. Is there a noteworthy difference? Hint: There is a
variable called
SHOOTING.
crime %>%
arrange(desc(YEAR), desc(MONTH)) %>%
slice(1:10000) %>%
mutate(SHOOTING = ifelse(is.na(SHOOTING), 0, SHOOTING)) %>%
summarise(porpotion = mean(SHOOTING == "Y"))
## # A tibble: 1 x 1
## porpotion
## <dbl>
## 1 0.0027
set.seed(1245)
crime %>%
slice_sample(n = 10000) %>%
mutate(SHOOTING = ifelse(is.na(SHOOTING), 0, SHOOTING)) %>%
summarise(porpotion = mean(SHOOTING == "Y"))
## # A tibble: 1 x 1
## porpotion
## <dbl>
## 1 0.0043
- Compute the TB rate for table2, table4a and table4b.
table2[table2$type == "cases", "count"] / table2[table2$type == "population", "count"]
## count
## 1 0.0000372741
## 2 0.0001294466
## 3 0.0002193930
## 4 0.0004612363
## 5 0.0001667495
## 6 0.0001669488
table4a[ ,-1] / table4b[ ,-1]
## 1999 2000
## 1 0.0000372741 0.0001294466
## 2 0.0002193930 0.0004612363
## 3 0.0001667495 0.0001669488
- Recreate the plot given by the code below using
table2
instead of table1.
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))

table2 %>%
filter(type == "cases") %>%
ggplot( aes(x = year, y = count)) +
geom_line(aes( group = country)) +
geom_point(aes(color = country))
